library(tidyverse)
library(data.table)
library(leaflet)
library(ggplot2)Assignment-1
Step 1
Read in the data and conduct EDA checklist items 2-4
# Read in the data using data.table()
setwd("~/Desktop/PM566/Assignment1")
pm2.5_2002 <- data.table::fread("ad_viz_plotval_data_2002.csv")
pm2.5_2022 <- data.table::fread("ad_viz_plotval_data_2022.csv")
# check the dimensions, headers, footers, variable names and variable types
# For data of 2002
# dimensions
dim(pm2.5_2002)[1] 15976 20
# headers
head(pm2.5_2002) Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
1: 01/05/2002 AQS 60010007 1 25.1 ug/m3 LC
2: 01/06/2002 AQS 60010007 1 31.6 ug/m3 LC
3: 01/08/2002 AQS 60010007 1 21.4 ug/m3 LC
4: 01/11/2002 AQS 60010007 1 25.9 ug/m3 LC
5: 01/14/2002 AQS 60010007 1 34.5 ug/m3 LC
6: 01/17/2002 AQS 60010007 1 41.0 ug/m3 LC
DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1: 78 Livermore 1 100
2: 92 Livermore 1 100
3: 71 Livermore 1 100
4: 80 Livermore 1 100
5: 98 Livermore 1 100
6: 115 Livermore 1 100
AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
1: 88101 PM2.5 - Local Conditions 41860
2: 88101 PM2.5 - Local Conditions 41860
3: 88101 PM2.5 - Local Conditions 41860
4: 88101 PM2.5 - Local Conditions 41860
5: 88101 PM2.5 - Local Conditions 41860
6: 88101 PM2.5 - Local Conditions 41860
CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
SITE_LATITUDE SITE_LONGITUDE
1: 37.68753 -121.7842
2: 37.68753 -121.7842
3: 37.68753 -121.7842
4: 37.68753 -121.7842
5: 37.68753 -121.7842
6: 37.68753 -121.7842
# footers
tail(pm2.5_2002) Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
1: 12/10/2002 AQS 61131003 1 15 ug/m3 LC
2: 12/13/2002 AQS 61131003 1 15 ug/m3 LC
3: 12/22/2002 AQS 61131003 1 1 ug/m3 LC
4: 12/25/2002 AQS 61131003 1 23 ug/m3 LC
5: 12/28/2002 AQS 61131003 1 5 ug/m3 LC
6: 12/31/2002 AQS 61131003 1 6 ug/m3 LC
DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1: 57 Woodland-Gibson Road 1 100
2: 57 Woodland-Gibson Road 1 100
3: 4 Woodland-Gibson Road 1 100
4: 74 Woodland-Gibson Road 1 100
5: 21 Woodland-Gibson Road 1 100
6: 25 Woodland-Gibson Road 1 100
AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
1: 88101 PM2.5 - Local Conditions 40900
2: 88101 PM2.5 - Local Conditions 40900
3: 88101 PM2.5 - Local Conditions 40900
4: 88101 PM2.5 - Local Conditions 40900
5: 88101 PM2.5 - Local Conditions 40900
6: 88101 PM2.5 - Local Conditions 40900
CBSA_NAME STATE_CODE STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
COUNTY SITE_LATITUDE SITE_LONGITUDE
1: Yolo 38.66121 -121.7327
2: Yolo 38.66121 -121.7327
3: Yolo 38.66121 -121.7327
4: Yolo 38.66121 -121.7327
5: Yolo 38.66121 -121.7327
6: Yolo 38.66121 -121.7327
# variable names and types
str(pm2.5_2002)Classes 'data.table' and 'data.frame': 15976 obs. of 20 variables:
$ Date : chr "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
$ Source : chr "AQS" "AQS" "AQS" "AQS" ...
$ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
$ POC : int 1 1 1 1 1 1 1 1 1 1 ...
$ Daily Mean PM2.5 Concentration: num 25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
$ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
$ DAILY_AQI_VALUE : int 78 92 71 80 98 115 87 57 65 107 ...
$ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
$ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
$ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
$ AQS_PARAMETER_CODE : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
$ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
$ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
$ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
$ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
$ STATE : chr "California" "California" "California" "California" ...
$ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
$ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
$ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
$ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
- attr(*, ".internal.selfref")=<externalptr>
# For data in 2022
# dimensions
dim(pm2.5_2022)[1] 56140 20
# headers
head(pm2.5_2022) Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
1: 01/01/2022 AQS 60010007 3 12.7 ug/m3 LC
2: 01/02/2022 AQS 60010007 3 13.9 ug/m3 LC
3: 01/03/2022 AQS 60010007 3 7.1 ug/m3 LC
4: 01/04/2022 AQS 60010007 3 3.7 ug/m3 LC
5: 01/05/2022 AQS 60010007 3 4.2 ug/m3 LC
6: 01/06/2022 AQS 60010007 3 3.8 ug/m3 LC
DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1: 52 Livermore 1 100
2: 55 Livermore 1 100
3: 30 Livermore 1 100
4: 15 Livermore 1 100
5: 18 Livermore 1 100
6: 16 Livermore 1 100
AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
1: 88101 PM2.5 - Local Conditions 41860
2: 88101 PM2.5 - Local Conditions 41860
3: 88101 PM2.5 - Local Conditions 41860
4: 88101 PM2.5 - Local Conditions 41860
5: 88101 PM2.5 - Local Conditions 41860
6: 88101 PM2.5 - Local Conditions 41860
CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
SITE_LATITUDE SITE_LONGITUDE
1: 37.68753 -121.7842
2: 37.68753 -121.7842
3: 37.68753 -121.7842
4: 37.68753 -121.7842
5: 37.68753 -121.7842
6: 37.68753 -121.7842
# footers
tail(pm2.5_2022) Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
1: 12/01/2022 AQS 61131003 1 3.4 ug/m3 LC
2: 12/07/2022 AQS 61131003 1 3.8 ug/m3 LC
3: 12/13/2022 AQS 61131003 1 6.0 ug/m3 LC
4: 12/19/2022 AQS 61131003 1 34.8 ug/m3 LC
5: 12/25/2022 AQS 61131003 1 23.2 ug/m3 LC
6: 12/31/2022 AQS 61131003 1 1.0 ug/m3 LC
DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1: 14 Woodland-Gibson Road 1 100
2: 16 Woodland-Gibson Road 1 100
3: 25 Woodland-Gibson Road 1 100
4: 99 Woodland-Gibson Road 1 100
5: 74 Woodland-Gibson Road 1 100
6: 4 Woodland-Gibson Road 1 100
AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
1: 88101 PM2.5 - Local Conditions 40900
2: 88101 PM2.5 - Local Conditions 40900
3: 88101 PM2.5 - Local Conditions 40900
4: 88101 PM2.5 - Local Conditions 40900
5: 88101 PM2.5 - Local Conditions 40900
6: 88101 PM2.5 - Local Conditions 40900
CBSA_NAME STATE_CODE STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
COUNTY SITE_LATITUDE SITE_LONGITUDE
1: Yolo 38.66121 -121.7327
2: Yolo 38.66121 -121.7327
3: Yolo 38.66121 -121.7327
4: Yolo 38.66121 -121.7327
5: Yolo 38.66121 -121.7327
6: Yolo 38.66121 -121.7327
# variable names and types
str(pm2.5_2022)Classes 'data.table' and 'data.frame': 56140 obs. of 20 variables:
$ Date : chr "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
$ Source : chr "AQS" "AQS" "AQS" "AQS" ...
$ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
$ POC : int 3 3 3 3 3 3 3 3 3 3 ...
$ Daily Mean PM2.5 Concentration: num 12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
$ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
$ DAILY_AQI_VALUE : int 52 55 30 15 18 16 10 29 54 47 ...
$ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
$ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
$ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
$ AQS_PARAMETER_CODE : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
$ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
$ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
$ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
$ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
$ STATE : chr "California" "California" "California" "California" ...
$ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
$ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
$ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
$ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
- attr(*, ".internal.selfref")=<externalptr>
# check for any data issues
summary(pm2.5_2002$`Daily Mean PM2.5 Concentration`) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.00 12.00 16.12 20.50 104.30
summary(pm2.5_2022$`Daily Mean PM2.5 Concentration`) Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.20 4.20 6.90 8.52 10.80 302.50
It dose not make sense to have a daily mean PM2.5 concentration to be -2.2. So I decide to remove all the parts that PM2.5 < 0.
pm2.5_2022 <- pm2.5_2022[`Daily Mean PM2.5 Concentration` >= 0,]
summary(pm2.5_2022$`Daily Mean PM2.5 Concentration`) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.200 7.000 8.554 10.800 302.500
Summary: The mean of daily mean PM2.5 concentration in 2022 is 8.554 ug/m3 LC, which is lower than 16.12 ug/m3 LC in 2002. This may give us a possible sign that daily PM2.5 concentration have decreased in California over the last 20 years.
Step 2
Combine the two years of data into one data frame. Use the Date variable to create a new column for year, which will serve as an identifier. Change the names of the key variables so that they are easier to refer to in your code.
# Create new column of 'year'
pm2.5_2002 <- pm2.5_2002[, year := 2002]
pm2.5_2022 <- pm2.5_2022[, year := 2022]
# Combine two years of data into one
dat_pm2.5 <- rbind(pm2.5_2002, pm2.5_2022)
# Change the names of key variables
setnames(dat_pm2.5,c("Daily Mean PM2.5 Concentration", "SITE_LATITUDE", "SITE_LONGITUDE", "Site Name"), c("pm2.5", "lat", "lon", "site"))Step 3
Create a basic map in leaflet() that shows the locations of the sites (make sure to use different colors for each year). Summarize the spatial distribution of the monitoring sites.
leaflet(dat_pm2.5) %>%
addTiles() %>%
addCircles(
lng = ~lon,
lat = ~lat,
color = ~ifelse(year == 2002, "red", "blue"),
weight = 2,
opacity =1,
fillOpacity = 1,
radius = 100,
popup = ~site,
label = "Map of Sites Measured in 2002(red) and 2022 (blue)")The map shows that the number of blue spots(2022) is much more than the number of red spots(2002). And also the blue spots are more wide-spread than red spots.
Step 4
Check for any missing or implausible values of PM2.5 in the combined dataset. Explore the proportions of each and provide a summary of any temporal patterns you see in these observations.
# Check for missing value
sum(is.na(dat_pm2.5$pm2.5))[1] 0
# Check for implausible values
summary(dat_pm2.5$pm2.5) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 4.60 7.70 10.23 12.40 302.50
Step 5
Explore the main question of interest at three different spatial levels. Create exploratory plots (e.g. boxplots, histograms, line plots) and summary statistics that best suit each level of data. Be sure to write up explanations of what you observe in these data.
State Level
# Boxplot
dat_pm2.5[!is.na(year)] %>%
ggplot() +
geom_boxplot(mapping = aes(x = year, y = pm2.5, group = year))# Barcharts
average_pm <- dat_pm2.5 %>%
group_by(year) %>%
summarize(avg_pm = mean(pm2.5, na.rm = TRUE))
average_pm %>%
ggplot() +
geom_bar(mapping = aes(x = year, y = stat(average_pm$avg_pm) , group = 1))Warning: `stat(average_pm$avg_pm)` was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(average_pm$avg_pm)` instead.
# Summary statistics
tapply(dat_pm2.5$pm2.5, dat_pm2.5$year, summary)$`2002`
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.00 12.00 16.12 20.50 104.30
$`2022`
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.200 7.000 8.554 10.800 302.500
t.test(pm2.5_2002$`Daily Mean PM2.5 Concentration`, pm2.5_2022$`Daily Mean PM2.5 Concentration`, paired = FALSE)
Welch Two Sample t-test
data: pm2.5_2002$`Daily Mean PM2.5 Concentration` and pm2.5_2022$`Daily Mean PM2.5 Concentration`
t = 66.136, df = 18805, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
7.337922 7.786156
sample estimates:
mean of x mean of y
16.115943 8.553904
From the result, there was a statistically significant decrease in daily mean PM2.5 concentration from 2002 to 2022.
County Level
pm2.5_County = dat_pm2.5 %>%
filter(dat_pm2.5$COUNTY %in%
intersect(pm2.5_2002$COUNTY, pm2.5_2022$COUNTY))
County = group_by(pm2.5_County, year, COUNTY) %>%
summarize(`pm2.5` = mean(`pm2.5`, na.rm = TRUE), .groups = "drop")
Daily_PM2.5_Concentration <- (County$`pm2.5`)
# Boxplot
County %>%
ggplot() +
geom_boxplot(mapping = aes(x = year, y = pm2.5, group = COUNTY))average_pm_by_county_02 <- dat_pm2.5[dat_pm2.5$year == 2002, ] %>%
group_by(COUNTY) %>%
summarize(
average_pm_2002 = mean(pm2.5, na.rm = TRUE)
)
average_pm_by_county_22 <- dat_pm2.5[dat_pm2.5$year == 2022, ] %>%
group_by(COUNTY) %>%
summarize(
average_pm_2022 = mean(pm2.5, na.rm = TRUE)
)
t.test(average_pm_by_county_02$average_pm_2002, average_pm_by_county_22$average_pm_2022, paired = FALSE)
Welch Two Sample t-test
data: average_pm_by_county_02$average_pm_2002 and average_pm_by_county_22$average_pm_2022
t = 4.841, df = 60.526, p-value = 9.299e-06
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
2.582337 6.218018
sample estimates:
mean of x mean of y
12.23913 7.83895
From the result, there was a statistically significant decrease in daily mean PM2.5 concentration by county from 2002 to 2022.
Site Level
average_pm_by_site_02 <- dat_pm2.5[dat_pm2.5$year == 2002, ] %>%
group_by(site) %>%
summarize(
average_site_pm_2002 = mean(pm2.5, na.rm = TRUE),
Year = mean(year),
Lat = mean(lat),
Lon = mean(lon)
)
average_pm_by_site_22 <- dat_pm2.5[dat_pm2.5$year == 2022, ] %>%
group_by(site) %>%
summarize(
average_site_pm_2022 = mean(pm2.5, na.rm = TRUE),
Year = mean(year),
Lat = mean(lat),
Lon = mean(lon)
)
# leaflet maps
Site_mean <- rbindlist(list(
average_pm_by_site_02,
average_pm_by_site_22),fill = TRUE)
color_palette <- colorNumeric(
palette = "viridis",
domain = Site_mean$average_pm_2002_site
)
temp.pal02 <- colorNumeric(c('pink','deeppink','deeppink4'), domain=average_pm_by_site_02$average_site_pm_2002)
leaflet02 <- leaflet(average_pm_by_site_02) %>%
addProviderTiles('CartoDB.Positron') %>%
addCircles(
lat = ~Lat, lng=~Lon,
label = ~paste0(round(average_pm_by_site_02$average_site_pm_2002,2), ' PM2.5'), color = ~ temp.pal02(average_pm_by_site_02$average_site_pm_2002),
opacity = 1, fillOpacity = 1, radius = 500
) %>%
addLegend('bottomleft', pal=temp.pal02, values=average_pm_by_site_02$average_site_pm_2002,
title='Mean Concentrations PM2.5 by site in 2002', opacity=1)
leaflet22 <- leaflet(average_pm_by_site_22) %>%
addProviderTiles('CartoDB.Positron') %>%
addCircles(
lat = ~Lat, lng=~Lon,
label = ~paste0(round(average_pm_by_site_22$average_site_pm_2022,2), ' PM2.5'), color = ~ temp.pal02(average_pm_by_site_02$average_site_pm_2002),
opacity = 1, fillOpacity = 1, radius = 500
) %>%
addLegend('bottomleft', pal=temp.pal02, values=average_pm_by_site_02$average_site_pm_2002,
title='Mean Concentrations PM2.5 by site in 2022', opacity=1)
leaflet02leaflet22# t-test
t.test(average_pm_by_site_02$average_site_pm_2002, average_pm_by_site_22$average_site_pm_2022, paired = FALSE)
Welch Two Sample t-test
data: average_pm_by_site_02$average_site_pm_2002 and average_pm_by_site_22$average_site_pm_2022
t = 7.2463, df = 126.67, p-value = 3.71e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
3.732122 6.536277
sample estimates:
mean of x mean of y
13.211227 8.077028
From the result, there was a statistically significant decrease in the daily average PM2.5 concentrations by site from 2002 to 2022.